Set/check knitR option and working directory

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ ggplot2 3.3.0     ✔ purrr   0.3.3
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(here)
## here() starts at /home/rguerillot/Documents/Travail/Abdou_project/Staph_infection_project/github_analysis/VANANZ_phenotypes
library(adegenet)
## Loading required package: ade4
## Registered S3 method overwritten by 'spdep':
##   method   from
##   plot.mst ape
## 
##    /// adegenet 2.1.1 is loaded ////////////
## 
##    > overview: '?adegenet'
##    > tutorials/doc/questions: 'adegenetWeb()' 
##    > bug reports/feature requests: adegenetIssues()
library(factoextra)
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library(corrplot)
## corrplot 0.84 loaded
library(RColorBrewer)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(cluster)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_knit$set(root.dir = here())
print(paste("My working directory is:" ,here()))
## [1] "My working directory is: /home/rguerillot/Documents/Travail/Abdou_project/Staph_infection_project/github_analysis/VANANZ_phenotypes"

import function and data

source("Functions/all_functions.R")
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following object is masked from 'package:dplyr':
## 
##     combine
median_sample_parameters_GC.df <- read.csv("Ideas_Grant_2020_analysis/Processed_data/Growth_curves/processed_median_parameters_GC.csv") %>%
  filter(strain_group != "CONTROL") %>%
  select_if(grepl("OD", names(.)) | grepl("sample_id", names(.))) %>%
  select(-end_point_timepoint_OD)

median_sample_parameters_PI.df <- read.csv("Ideas_Grant_2020_analysis/Processed_data/PI_curves/processed_median_parameters_PI.csv") %>%
  filter(strain_group != "CONTROL") %>%
  select(-end_point_timepoint_death)

median_sample_parameters_all.df <- merge(median_sample_parameters_GC.df, median_sample_parameters_PI.df, by = "sample_id") 


# correct sample metadata (mortality annotation for last isolate (not index isolate) + all isolates from survived patient = survived)
sample_meta.df <- read.csv("Ideas_Grant_2020_analysis/Raw_data/strain_metadata_corrected_mortality_with_controls.csv")

median_sample_parameters_all.df <- median_sample_parameters_all.df %>%
  select_if(grepl("OD", names(.)) | grepl("death", names(.)) | grepl("sample_id", names(.))) %>%
  merge(., sample_meta.df, by = "sample_id")%>%
  mutate(ST_simp = fct_lump(ST, n = 5))


median_param_dat.df <- median_sample_parameters_all.df %>%
  select_if(grepl("OD", names(.)) | grepl("death", names(.))) 

Check each variable individually

for (var in  grep("OD", colnames(median_sample_parameters_all.df), value = T)) {
  t<-ggplot(median_sample_parameters_all.df, aes_string(x = var))+
    geom_density(aes(fill = mortality), alpha = .5)
  print(t)
}

for (var in  grep("death", colnames(median_sample_parameters_all.df), value = T)) {
  t<-ggplot(median_sample_parameters_all.df, aes_string(x = var))+
    geom_density(aes(fill = mortality), alpha = .5)
  print(t)
}

for (var in  grep("OD", colnames(median_sample_parameters_all.df), value = T)) {
  t <- ggviolin(data = median_sample_parameters_all.df,
          y = var,
          x = "mortality",
          fill = "mortality") +
  theme_bw() +
  theme(legend.position = "none")+
  stat_compare_means(ref.group ="Survived",
                     method = "wilcox.test",
                     label = "p.signif")
  print(t)
}

for (var in  grep("death", colnames(median_sample_parameters_all.df), value = T)) {
  t <- ggviolin(data = median_sample_parameters_all.df,
          y = var,
          x = "mortality",
          fill = "mortality") +
  theme_bw() +
  theme(legend.position = "none")+
  stat_compare_means(ref.group ="Survived",
                     method = "wilcox.test",
                     label = "p.signif")
  print(t)
}

Test vor non-homogeneity of variance between mortality groups for each variable

Rationale: It seems that for several GC PI parameters there more variance in the died group Levene test is designed to test for this http://www.sthda.com/english/wiki/compare-multiple-sample-variances-in-r

library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
for (i in 1:length(colnames(median_param_dat.df))) {
  print( paste0("Current tested variable is ", colnames(median_param_dat.df)[i]))
  print(leveneTest(median_param_dat.df[[i]], median_sample_parameters_all.df$mortality))
  print("")
  print("")
}
## [1] "Current tested variable is time_of_max_rate_OD"
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value    Pr(>F)    
## group   1  28.141 2.595e-07 ***
##       236                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] ""
## [1] ""
## [1] "Current tested variable is max_rate_OD"
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value    Pr(>F)    
## group   1  35.798 8.063e-09 ***
##       236                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] ""
## [1] ""
## [1] "Current tested variable is doubling_time_OD"
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value    Pr(>F)    
## group   1  12.857 0.0004085 ***
##       236                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] ""
## [1] ""
## [1] "Current tested variable is AUC_OD"
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value  Pr(>F)    
## group   1  70.381 4.5e-15 ***
##       236                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] ""
## [1] ""
## [1] "Current tested variable is time_of_max_OD"
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value   Pr(>F)   
## group   1  7.7239 0.005888 **
##       236                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] ""
## [1] ""
## [1] "Current tested variable is max_OD"
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value    Pr(>F)    
## group   1   32.74 3.178e-08 ***
##       236                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] ""
## [1] ""
## [1] "Current tested variable is time_of_min_OD"
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value Pr(>F)
## group   1  0.0542 0.8161
##       236               
## [1] ""
## [1] ""
## [1] "Current tested variable is min_OD"
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value  Pr(>F)  
## group   1  5.8673 0.01618 *
##       236                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] ""
## [1] ""
## [1] "Current tested variable is end_point_OD"
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value    Pr(>F)    
## group   1  18.218 2.854e-05 ***
##       236                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] ""
## [1] ""
## [1] "Current tested variable is AUC_death"
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value Pr(>F)
## group   1  0.9854 0.3219
##       236               
## [1] ""
## [1] ""
## [1] "Current tested variable is time_of_max_death"
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value Pr(>F)
## group   1  1.8397 0.1763
##       236               
## [1] ""
## [1] ""
## [1] "Current tested variable is max_death"
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value Pr(>F)
## group   1  0.8513 0.3571
##       236               
## [1] ""
## [1] ""
## [1] "Current tested variable is time_of_min_death"
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value    Pr(>F)    
## group   1  22.525 3.596e-06 ***
##       236                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] ""
## [1] ""
## [1] "Current tested variable is min_death"
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value   Pr(>F)   
## group   1  6.8658 0.009356 **
##       236                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] ""
## [1] ""
## [1] "Current tested variable is time_of_max_rate_death"
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value Pr(>F)
## group   1  1.4615 0.2279
##       236               
## [1] ""
## [1] ""
## [1] "Current tested variable is max_rate_death"
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value Pr(>F)
## group   1  0.5142  0.474
##       236               
## [1] ""
## [1] ""
## [1] "Current tested variable is doubling_time_death"
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value Pr(>F)
## group   1   0.033  0.856
##       236               
## [1] ""
## [1] ""
## [1] "Current tested variable is end_point_death"
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value  Pr(>F)  
## group   1  4.6914 0.03132 *
##       236                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## [1] ""
## [1] ""
fligner.test(max_rate_OD ~ mortality, data = median_sample_parameters_all.df)
## 
##  Fligner-Killeen test of homogeneity of variances
## 
## data:  max_rate_OD by mortality
## Fligner-Killeen:med chi-squared = 36.209, df = 1, p-value =
## 1.773e-09

Check vriable correlation with correlogram

M <-cor(median_param_dat.df, method = "pearson")
corrplot(M, type="upper")

M <-cor(median_param_dat.df, method = "kendall")
corrplot(M, type="upper")

M <-cor(median_param_dat.df, method = "spearman")
corrplot(M, type="upper")

Scatter plot matrix

my_cols <- c("#00AFBB", "#E7B800")#, "#FC4E07")  
pairs(median_param_dat.df, pch = 19,  cex = 0.5,
      col = my_cols[median_sample_parameters_all.df$mortality],
      lower.panel=NULL)

Format parameter df for PCA

pca.df <- median_param_dat.df
row.names(pca.df) <- median_sample_parameters_all.df$sample_id

pca1 <- dudi.pca(df = pca.df, scannf = F, nf = 5, scale = T)

Plot % variance explained per axes

fviz_screeplot(pca1, addlabels = T)

Plot individual and variable loading with group

# mortality
fviz_pca_biplot(pca1,
             #select.ind = list(contrib = 10),
             axes = c(1,2),
             geom = c("point"),
             alpha.ind = .5,
             addEllipses = T,
             ellipse.level = .95,
             col.ind =  median_sample_parameters_all.df$mortality,
             col.circle = median_sample_parameters_all.df$mortality ,
             title = paste("PCA with variable loading" ),
             #repel = T,
             pointshape = 16,
             )

fviz_pca_biplot(pca1,
             #select.ind = list(contrib = 10),
             axes = c(2,3),
             geom = c("point"),
             alpha.ind = .5,
             addEllipses = T,
             ellipse.level = .95,
             col.ind =  median_sample_parameters_all.df$mortality,
             col.circle = median_sample_parameters_all.df$mortality ,
             title = paste("PCA with variable loading" ),
             #repel = T,
             pointshape = 16,
             )

#ST
fviz_pca_biplot(pca1,
             #select.ind = list(contrib = 10),
             axes = c(1,2),
             geom = c("point"),
             alpha.ind = .5,
             addEllipses = T,
             ellipse.level = .95,
             col.ind =  median_sample_parameters_all.df$ST_simp,
             col.circle = median_sample_parameters_all.df$ST_simp ,
             title = paste("PCA with variable loading" ),
             #repel = T,
             pointshape = 16,
             )

fviz_pca_biplot(pca1,
             #select.ind = list(contrib = 10),
             axes = c(2,3),
             geom = c("point"),
             alpha.ind = .5,
             addEllipses = T,
             ellipse.level = .95,
             col.ind =  median_sample_parameters_all.df$ST_simp,
             col.circle = median_sample_parameters_all.df$ST_simp ,
             title = paste("PCA with variable loading" ),
             #repel = T,
             pointshape = 16,
             )

Contributions of variables to PC1

fviz_contrib(pca1, choice = "var", axes = 1)

Contributions of variables to PC2

fviz_contrib(pca1, choice = "var", axes = 2)

Contribution of variables to PC1 to 4

fviz_contrib(pca1, choice = "var", axes = 1:4)

Rerun PCA with variable with max contrib.

pca.df <- median_param_dat.df %>%
#  select(AUC_death, max_death, end_point_OD, AUC_OD, max_OD, end_point_death, max_rate_death, max_rate_OD) %>%
#  select(AUC_death, AUC_OD, max_OD, max_rate_death, max_rate_OD, doubling_time_OD) %>%
  filter(!is.na(median_sample_parameters_all.df$mortality))
#row.names(pca.df) <- median_sample_parameters_all.df$sample_id

pca1 <- dudi.pca(df = pca.df, scannf = F, nf = 5, scale = T, center = T)

Plot % variance explained per axes

fviz_screeplot(pca1, addlabels = T)

Plot individual and variable loading with group

# mortality
fviz_pca_biplot(pca1,
             #select.ind = list(contrib = 10),
             axes = c(1,2),
             geom = c("point"),
             alpha.ind = .5,
             addEllipses = T,
             ellipse.level = .75,
             col.ind =  median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality,
             col.circle = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality,
             title = paste("PCA with variable loading" ),
             #repel = T,
             pointshape = 16,
             )

fviz_pca_biplot(pca1,
             #select.ind = list(contrib = 10),
             axes = c(2,3),
             geom = c("point"),
             alpha.ind = .5,
             addEllipses = T,
             ellipse.level = .75,
             col.ind =  median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality,
             col.circle = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality ,
             title = paste("PCA with variable loading" ),
             #repel = T,
             pointshape = 16,
             )

fviz_pca_biplot(pca1,
             #select.ind = list(contrib = 10),
             axes = c(1,2),
             geom = c("point"),
             alpha.ind = .5,
             addEllipses = T,
             ellipse.level = .75,
             col.ind =  median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$ST_simp,
             col.circle = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$ST_simp,
             title = paste("PCA with variable loading" ),
             #repel = T,
             pointshape = 16,
             )

fviz_pca_biplot(pca1,
             #select.ind = list(contrib = 10),
             axes = c(2,3),
             geom = c("point"),
             alpha.ind = .5,
             addEllipses = T,
             ellipse.level = .75,
             col.ind =  median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$ST_simp,
             col.circle = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$ST_simp ,
             title = paste("PCA with variable loading" ),
             #repel = T,
             pointshape = 16,
             )

Compute Discriminant Analysis of Principal Components (DAPC)

pca.df <- median_param_dat.df %>%
   filter(!is.na(median_sample_parameters_all.df$mortality))
  
row.names(pca.df) <- median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$sample_id

# All vs All

dapc1 <- adegenet::dapc(x = pca.df,
                        grp = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality == "Died",
                        n.pca = 6, 
                        n.da = 2, 
                        scale = T,
                        center = T)
summary(dapc1)
## $n.dim
## [1] 1
## 
## $n.pop
## [1] 2
## 
## $assign.prop
## [1] 0.8571429
## 
## $assign.per.pop
##     FALSE      TRUE 
## 0.9714286 0.5396825 
## 
## $prior.grp.size
## 
## FALSE  TRUE 
##   175    63 
## 
## $post.grp.size
## 
## FALSE  TRUE 
##   199    39
scatter.dapc(dapc1)

loadingplot(dapc1$var.contr)

dapc0 <-data.frame(comparison = "All vs ALL", 
                   var = row.names(dapc1$var.contr), 
                   contrib = as.character(dapc1$var.contr), 
                   row.names = NULL)

reconpute PCA with only best mortality discriminating variable

pca.df <- pca.df %>%
  select(time_of_max_OD, time_of_min_death, time_of_max_rate_OD, time_of_max_rate_death, max_rate_OD )

pca1 <- dudi.pca(df = pca.df, scannf = F, nf = 5, scale = T, center = T)

Plot % variance explained per axes

fviz_screeplot(pca1, addlabels = T)

Plot individual and variable loading with group

# mortality
fviz_pca_biplot(pca1,
             #select.ind = list(contrib = 10),
             axes = c(1,2),
             geom = c("point"),
             alpha.ind = .5,
             addEllipses = T,
             ellipse.level = .95,
             col.ind =  median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality,
             col.circle = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality,
             title = paste("PCA with variable loading" ),
             #repel = T,
             pointshape = 16,
             )

fviz_pca_biplot(pca1,
             #select.ind = list(contrib = 10),
             axes = c(1,3),
             geom = c("point"),
             alpha.ind = .5,
             addEllipses = T,
             ellipse.level = .95,
             col.ind =  median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality,
             col.circle = median_sample_parameters_all.df %>% filter(!is.na(mortality)) %>% .$mortality,
             title = paste("PCA with variable loading" ),
             #repel = T,
             pointshape = 16,
             )

Same with NA mortality

pca.df <- median_param_dat.df %>%
  select(time_of_max_OD, time_of_min_death, time_of_max_rate_OD, time_of_max_rate_death, max_rate_OD )

row.names(pca.df) <- median_sample_parameters_all.df$sample_id

pca1 <- dudi.pca(df = pca.df, scannf = F, nf = 5, scale = T)

Plot % variance explained per axes

fviz_screeplot(pca1, addlabels = T)

Plot individual and variable loading with group

# mortality
fviz_pca_biplot(pca1,
             #select.ind = list(contrib = 10),
             axes = c(1,2),
             geom = c("point"),
             alpha.ind = .5,
             addEllipses = T,
             ellipse.level = .95,
             col.ind =  median_sample_parameters_all.df$mortality,
             col.circle = median_sample_parameters_all.df$mortality ,
             title = paste("PCA with variable loading" ),
             #repel = T,
             pointshape = 16,
             )

fviz_pca_biplot(pca1,
             #select.ind = list(contrib = 10),
             axes = c(1,3),
             geom = c("point"),
             alpha.ind = .5,
             addEllipses = T,
             ellipse.level = .95,
             col.ind =  median_sample_parameters_all.df$mortality,
             col.circle = median_sample_parameters_all.df$mortality ,
             title = paste("PCA with variable loading" ),
             #repel = T,
             pointshape = 16,
             )

Try random forest classifier

library(randomForest)


# add sample_id number (ID) to median_sample_parameters_all.df
median_sample_parameters_all.df <- median_sample_parameters_all.df %>%
  mutate(ID = row.names(.))

rf.df <- median_sample_parameters_all.df %>%
  select_if(grepl("OD", names(.)) | grepl("death", names(.))) %>%
  mutate(mortality = median_sample_parameters_all.df$mortality) %>%
  mutate(ID = median_sample_parameters_all.df$ID) %>%
  mutate(ST_simp =  median_sample_parameters_all.df$ST_simp)

# can filter or sample here
rf.df <- rf.df %>%
  filter(!is.na(mortality))

rownames(rf.df) <- rf.df$ID


set.seed(100)
# train <- sample(nrow(EM.rf.df), 0.7*nrow(EM.rf.df), replace = FALSE)
# TrainSet <- EM.rf.df[train,]
# ValidSet <- EM.rf.df[-train,]

pred_rf.df <- rf.df %>% select(-mortality, -ID, -ST_simp)
resp_rf.vec <- rf.df$mortality

model1 <- randomForest(x = pred_rf.df, y= resp_rf.vec, data = EM.rf.df, importance = TRUE, ntree = 100)

model1
## 
## Call:
##  randomForest(x = pred_rf.df, y = resp_rf.vec, ntree = 100, importance = TRUE,      data = EM.rf.df) 
##                Type of random forest: classification
##                      Number of trees: 100
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 11.34%
## Confusion matrix:
##          Died Survived class.error
## Died       41       22  0.34920635
## Survived    5      170  0.02857143
varImpPlot(model1)

# Predicting on Validation set
predValid <- predict(model1, rf.df, type = "class")
predValid <- as.data.frame(predValid) %>% 
  mutate(ID = rownames(.)) %>%
  rename(Prediction = predValid) %>%
  merge(., rf.df %>% select(ID, mortality),  by = "ID")%>%
  droplevels() %>%
  mutate(Accurate = ifelse(Prediction == mortality, T, F))

# Predict class proba
predValid_prob <- predict(model1, rf.df, type="prob")%>%
  as.data.frame(.) %>%
  mutate(ID = rownames(.)) %>%
  merge(., predValid)

# Checking classification accuracy
mean(predValid$Accurate)                    
## [1] 1
table(predValid$Prediction, predValid$mortality)
##           
##            Died Survived
##   Died       63        0
##   Survived    0      175
table(predValid$mortality, predValid$Accurate)
##           
##            TRUE
##   Died       63
##   Survived  175
# Keep n best predicted for each strain
best_pred.df <- predValid_prob %>%
  filter(Accurate == T) %>%
  mutate(Accurate_proba = pmax(Died, Survived)) %>%
  group_by(mortality) %>%
  #filter(Accurate_proba > 0.9)
  top_frac(.5, Accurate_proba)
  
count(best_pred.df, mortality)  
## # A tibble: 2 x 2
## # Groups:   mortality [2]
##   mortality     n
##   <fct>     <int>
## 1 Died         31
## 2 Survived    103

Re-compute PCA with best accurately classified

pca_meta.df3 <-  rf.df %>%
  filter(ID %in% best_pred.df$ID) 

pca.df3 <- pca_meta.df3 %>%
  select(-ID, -mortality, -ST_simp)

pca3 <- dudi.pca(df = pca.df3, scannf = F, nf = 5, scale = T)

Plot PCA

fviz_screeplot(pca3, addlabels = T)

fviz_pca_biplot(pca3,
             #select.ind = list(contrib = 10),
             axes = c(1,2),
             geom = c("point"),
             alpha.ind = .5,
             addEllipses = T,
             ellipse.level = .95,
             col.ind = pca_meta.df3$mortality,
             col.circle = pca_meta.df3$mortality,
             title = paste("PCA - Top 20% of accurately classified strains (Strain prototypes)" ),
             repel = T,
             pointshape = 16)

fviz_pca_biplot(pca3,
             #select.ind = list(contrib = 10),
             axes = c(2,3),
             geom = c("point"),
             alpha.ind = .5,
             addEllipses = T,
             ellipse.level = .95,
             col.ind = pca_meta.df3$mortality,
             col.circle = pca_meta.df3$mortality,
             title = paste("PCA - Top 20% of accurately classified strains (Strain prototypes)" ),
             repel = T,
             pointshape = 16)

fviz_pca_biplot(pca3,
             #select.ind = list(contrib = 10),
             axes = c(1,2),
             geom = c("point"),
             alpha.ind = .8,
             addEllipses = T,
             ellipse.level = .95,
             col.ind = pca_meta.df3$ST_simp,
             col.circle = pca_meta.df3$ST_simp,
             title = paste("PCA - Top 20% of accurately classified strains (Strain prototypes)" ),
             repel = T,
             pointshape = 16)
## Too few points to calculate an ellipse

fviz_pca_biplot(pca3,
             #select.ind = list(contrib = 10),
             axes = c(2,3),
             geom = c("point"),
             alpha.ind = .95,
             addEllipses = T,
             #ellipse.level = .99,
             col.ind = pca_meta.df3$ST_simp,
             col.circle = pca_meta.df3$ST_simp,
             title = paste("PCA - Top 20% of accurately classified strains (Strain prototypes)" ),
             repel = T,
             pointshape = 16)
## Too few points to calculate an ellipse

Try random forest unsupervised

# add sample_id number (ID) to median_sample_parameters_all.df
median_sample_parameters_all.df <- median_sample_parameters_all.df %>%
  mutate(ID = row.names(.))

rf.df <- median_sample_parameters_all.df %>%
  select_if(grepl("OD", names(.)) | grepl("death", names(.))) %>%
  mutate(mortality = median_sample_parameters_all.df$mortality) %>%
  mutate(ID = median_sample_parameters_all.df$ID) %>%
  mutate(ST_simp =  median_sample_parameters_all.df$ST_simp) %>%
  mutate(sample_id = median_sample_parameters_all.df$sample_id)

# can filter or sample here
rf.df <- rf.df %>%
  filter(!is.na(mortality))

rownames(rf.df) <- rf.df$ID


set.seed(100)
# train <- sample(nrow(EM.rf.df), 0.7*nrow(EM.rf.df), replace = FALSE)
# TrainSet <- EM.rf.df[train,]
# ValidSet <- EM.rf.df[-train,]

pred_rf.df <- rf.df %>% select(-mortality, -ID, -ST_simp, -sample_id)
resp_rf.vec <- rf.df$mortality

model1 <- randomForest(x = pred_rf.df, data = EM.rf.df, importance = TRUE, ntree = 100)

model1
## 
## Call:
##  randomForest(x = pred_rf.df, ntree = 100, importance = TRUE,      data = EM.rf.df) 
##                Type of random forest: unsupervised
##                      Number of trees: 100
## No. of variables tried at each split: 4
varImpPlot(model1)

prox <- model1$proximity
pam.rf <- pam(prox, 2)
pred <- cbind(pam.rf$clustering, as.character(rf.df$sample_id), as.character(rf.df$ST_simp), as.character(rf.df$mortality))
table(pred[,1], pred[,2])
##    
##     BPH2700 BPH2701 BPH2702 BPH2703 BPH2704 BPH2705 BPH2706 BPH2707
##   1       1       0       0       0       0       1       1       0
##   2       0       1       1       1       1       0       0       1
##    
##     BPH2708 BPH2709 BPH2710 BPH2711 BPH2712 BPH2713 BPH2714 BPH2715
##   1       0       0       0       0       1       0       0       0
##   2       1       1       1       1       0       1       1       1
##    
##     BPH2716 BPH2717 BPH2718 BPH2719 BPH2720 BPH2721 BPH2722 BPH2723
##   1       1       1       0       0       0       1       0       0
##   2       0       0       1       1       1       0       1       1
##    
##     BPH2724 BPH2725 BPH2726 BPH2727 BPH2728 BPH2729 BPH2730 BPH2731
##   1       0       0       0       0       1       0       1       0
##   2       1       1       1       1       0       1       0       1
##    
##     BPH2732 BPH2733 BPH2734 BPH2735 BPH2736 BPH2737 BPH2738 BPH2739
##   1       1       1       0       0       0       0       0       0
##   2       0       0       1       1       1       1       1       1
##    
##     BPH2740 BPH2741 BPH2742 BPH2743 BPH2744 BPH2745 BPH2746 BPH2747
##   1       1       1       0       1       1       0       1       0
##   2       0       0       1       0       0       1       0       1
##    
##     BPH2748 BPH2749 BPH2751 BPH2752 BPH2753 BPH2754 BPH2755 BPH2756
##   1       0       1       1       1       1       1       1       1
##   2       1       0       0       0       0       0       0       0
##    
##     BPH2757 BPH2758 BPH2759 BPH2760 BPH2761 BPH2763 BPH2764 BPH2765
##   1       1       0       1       1       0       0       1       0
##   2       0       1       0       0       1       1       0       1
##    
##     BPH2766 BPH2767 BPH2768 BPH2769 BPH2770 BPH2771 BPH2773 BPH2774
##   1       0       1       1       0       0       0       0       0
##   2       1       0       0       1       1       1       1       1
##    
##     BPH2775 BPH2776 BPH2777 BPH2778 BPH2779 BPH2780 BPH2781 BPH2782
##   1       0       0       0       0       0       0       0       0
##   2       1       1       1       1       1       1       1       1
##    
##     BPH2783 BPH2784 BPH2785 BPH2786 BPH2787 BPH2788 BPH2789 BPH2790
##   1       0       0       0       1       1       0       0       0
##   2       1       1       1       0       0       1       1       1
##    
##     BPH2791 BPH2792 BPH2793 BPH2794 BPH2795 BPH2796 BPH2797 BPH2798
##   1       0       1       1       0       1       1       1       0
##   2       1       0       0       1       0       0       0       1
##    
##     BPH2799 BPH2800 BPH2801 BPH2802 BPH2803 BPH2804 BPH2805 BPH2806
##   1       1       0       1       1       1       0       1       0
##   2       0       1       0       0       0       1       0       1
##    
##     BPH2807 BPH2808 BPH2809 BPH2810 BPH2811 BPH2812 BPH2813 BPH2814
##   1       0       0       0       1       0       1       1       1
##   2       1       1       1       0       1       0       0       0
##    
##     BPH2815 BPH2816 BPH2817 BPH2818 BPH2819 BPH2820 BPH2821 BPH2822
##   1       1       0       1       1       1       1       1       1
##   2       0       1       0       0       0       0       0       0
##    
##     BPH2823 BPH2824 BPH2825 BPH2827 BPH2828 BPH2829 BPH2830 BPH2831
##   1       0       0       0       0       0       1       1       1
##   2       1       1       1       1       1       0       0       0
##    
##     BPH2832 BPH2833 BPH2834 BPH2835 BPH2836 BPH2837 BPH2838 BPH2839
##   1       0       1       0       1       1       1       1       1
##   2       1       0       1       0       0       0       0       0
##    
##     BPH2840 BPH2841 BPH2842 BPH2843 BPH2844 BPH2845 BPH2846 BPH2847
##   1       1       1       0       0       1       1       1       0
##   2       0       0       1       1       0       0       0       1
##    
##     BPH2848 BPH2849 BPH2850 BPH2851 BPH2852 BPH2853 BPH2854 BPH2855
##   1       1       1       1       0       0       1       0       0
##   2       0       0       0       1       1       0       1       1
##    
##     BPH2856 BPH2857 BPH2858 BPH2859 BPH2860 BPH2861 BPH2862 BPH2863
##   1       0       0       1       1       1       1       1       1
##   2       1       1       0       0       0       0       0       0
##    
##     BPH2864 BPH2865 BPH2866 BPH2867 BPH2896 BPH2902 BPH2904 BPH2934
##   1       1       0       0       0       0       0       1       0
##   2       0       1       1       1       1       1       0       1
##    
##     BPH2938 BPH2942 BPH2943 BPH2949 BPH2962 BPH2963 BPH2976 BPH2977
##   1       0       1       1       1       1       1       0       0
##   2       1       0       0       0       0       0       1       1
##    
##     BPH2978 BPH2979 BPH2980 BPH2981 BPH2982 BPH3225 BPH3252 BPH3276
##   1       0       0       0       0       0       0       0       0
##   2       1       1       1       1       1       1       1       1
##    
##     BPH3306 BPH3328 BPH3336 BPH3337 BPH3350 BPH3358 BPH3360 BPH3361
##   1       0       0       1       0       0       1       0       1
##   2       1       1       0       1       1       0       1       0
##    
##     BPH3367 BPH3383 BPH3384 BPH3394 BPH3396 BPH3412 BPH3416 BPH3423
##   1       0       0       0       0       0       1       1       1
##   2       1       1       1       1       1       0       0       0
##    
##     BPH3425 BPH3427 BPH3448 BPH3451 BPH3455 BPH3478 BPH3483 BPH3504
##   1       1       0       1       0       1       1       0       1
##   2       0       1       0       1       0       0       1       0
##    
##     BPH3506 BPH3515 BPH3521 BPH3525 BPH3533 BPH3535 BPH3541 BPH3549
##   1       1       0       1       1       1       0       1       0
##   2       0       1       0       0       0       1       0       1
##    
##     BPH3560 BPH3563 BPH3572 BPH3594 BPH3617 BPH3627 BPH3644 BPH3646
##   1       1       0       1       0       1       1       1       1
##   2       0       1       0       1       0       0       0       0
##    
##     BPH3676 BPH3680 BPH3681 BPH3698 BPH3706 BPH3708 BPH3712 BPH3723
##   1       1       1       1       1       0       0       1       1
##   2       0       0       0       0       1       1       0       0
##    
##     BPH3725 BPH3733 BPH3734 BPH3747 BPH3752 BPH3757
##   1       0       0       1       0       0       0
##   2       1       1       0       1       1       1
table(pred[,1], pred[,3])
##    
##      - 22 239 45  5 Other
##   1 13  1  12  9 15    62
##   2 18 21  29 11  6    41
table(pred[,1], pred[,4])
##    
##     Died Survived
##   1   32       80
##   2   31       95
cluster.df <-cbind(pam.rf$clustering, as.character(rf.df$sample_id)) %>% as.data.frame()
colnames(cluster.df) <- c("cluster", "sample_id")
  
pca_meta.df4 <-  rf.df %>%
  merge(.,  cluster.df ,  by = "sample_id") 

pca.df4 <- pca_meta.df4 %>%
  select(-sample_id, -cluster, -ST_simp, -mortality, -ID)

pca4 <- dudi.pca(df = pca.df4, scannf = F, nf = 5, scale = T)



fviz_screeplot(pca4, addlabels = T)

fviz_pca_biplot(pca4,
             #select.ind = list(contrib = 10),
             axes = c(1,2),
             geom = c("point"),
             alpha.ind = .5,
             addEllipses = T,
             ellipse.level = .95,
             col.ind = pca_meta.df4$cluster,
             title = paste("" ),
             repel = T,
             pointshape = 16)

fviz_pca_biplot(pca4,
             #select.ind = list(contrib = 10),
             axes = c(1,2),
             geom = c("point"),
             alpha.ind = .5,
             addEllipses = T,
             ellipse.level = .95,
             col.ind = pca_meta.df4$mortality,
             title = paste("" ),
             repel = T,
             pointshape = 16)